library(haven)
library(dplyr)
library(tibble)
library(data.table)
library(stargazer)
library(sandwich)
library(lmtest) 

`%notin%`=Negate(`%in%`)

# Import the census waves 
cens1991 <- read_dta("H:/Zheng_10223/Joint/Census/census1991_occdata.dta")
cens1996 <- read_dta("H:/Zheng_10223/Joint/Census/census1996_occdata.dta")
cens2001 <- read_dta("H:/Zheng_10223/Joint/Census/census2001_occdata.dta")
cens2006 <- read_dta("H:/Zheng_10223/Joint/Census/census2006_occdata.dta")
cens2011 <- read_dta("H:/Zheng_10223/Joint/Census/census2011_occdata.dta")
allcens <- rbind(cens1991, cens1996, cens2001,cens2006,cens2011); rm(cens1991, cens1996, cens2001,cens2011,cens2006)

# cpi 
cpi <- read.csv("H:/Zheng_10223/Joint/cpi/cpi.csv")
allcens <- merge(x=allcens, y=cpi, by.x="year", by.y="year", all.x=T)
allcens$totinc_real <- allcens$totinc*cpi$cpi[which(cpi$year==2020)]/allcens$cpi
allcens$hhinc_real <- allcens$hhinc*cpi$cpi[which(cpi$year==2020)]/allcens$cpi

# drop less than zero income: 
allcens=allcens %>% filter(hhinc_real>=0)


# winsorize:
allcens$totinc_real[which(allcens$totinc_real>=quantile(allcens$totinc_real, 0.999, na.rm=T))] <- quantile(allcens$totinc_real, 0.999, na.rm=T)
allcens$hhinc_real[which(allcens$hhinc_real>=quantile(allcens$hhinc_real, 0.999, na.rm=T))] <- quantile(allcens$hhinc_real, 0.999, na.rm=T)


# create schooling bins for the Census data
allcens$SchoolingBins=NA
allcens$SchoolingBins[which(allcens$totyrs<12)] <- "<12"
allcens$SchoolingBins[which(allcens$totyrs==12)] <- "12"
allcens$SchoolingBins[which(allcens$totyrs>12 & allcens$totyrs<=15)] <- "12-15"
allcens$SchoolingBins[which(allcens$totyrs==16)] <- "16"
allcens$SchoolingBins[which(allcens$totyrs>16 & allcens$totyrs<99)] <- ">16"


# cma CODE IS COMBINATION of province and cma -> pr + cma 
allcens$cmastring=allcens$cma 
allcens$cmastring[nchar(allcens$cmastring)==1]=paste("00",allcens$cma[nchar(allcens$cmastring)==1],sep="")
allcens$cmastring[nchar(allcens$cmastring)==2]=paste("0",allcens$cma[nchar(allcens$cmastring)==2],sep="")

allcens$cma1=paste(allcens$pr,allcens$cmastring,sep="")


# in the allcens: 1 is female; 2 is male 

# opposite in imdb
# recode
allcens$sexnew <- NA
allcens$sexnew[allcens$sex==1] <- 2 # recode females to 2 = matches imdb
allcens$sexnew[allcens$sex==2] <- 1 # recode males to 1 = matches imdb

allcens$sex <- NULL; 
allcens <- allcens %>% rename(sex=sexnew)


allcens$DESTINATION_CMA <- "Other"
allcens$DESTINATION_CMA[allcens$cma1=="35535"]="Toronto"
allcens$DESTINATION_CMA[allcens$cma1=="24462"]="Montreal"
allcens$DESTINATION_CMA[allcens$cma1=="59933"]="Vancouver"
allcens$DESTINATION_CMA[allcens$cma1=="48825"]="Calgary"
allcens$DESTINATION_CMA[allcens$cma1=="48835"]="Edmonton"

# relevel occupation:
allcens$noc2011 <- factor(allcens$noc2011)
allcens$noc2011 <- relevel(allcens$noc2011, ref="86")

# clean occupation: 
allcens$noc2011[allcens$noc2011=="0"]="00"

#######################################################################################################################################################

# Import the cohort file 
dfcohort <- fread("H:/Zheng_10223/Joint/cohort_2025.csv")

# Unique...
dfcohort <- dfcohort[order(IMDB_ID, -Child_Income_IND_30_34_pct, -MainParent_Income_HH_MainParentAge45_49)]
dfcohort <- unique(dfcohort, by="IMDB_ID")

# convert these back to match the allcens
dfcohort$DESTINATION_CMA[which(dfcohort$DESTINATION_CMA=="Toronto")] <- "35535"
dfcohort$DESTINATION_CMA[which(dfcohort$DESTINATION_CMA=="Montreal")] <- "24462"
dfcohort$DESTINATION_CMA[which(dfcohort$DESTINATION_CMA=="Vancouver")] <- "59933"
dfcohort$DESTINATION_CMA[which(dfcohort$DESTINATION_CMA=="Calgary")] <- "48825"
dfcohort$DESTINATION_CMA[which(dfcohort$DESTINATION_CMA=="Edmonton")] <- "48835"

dfcohort$year <- NA
dfcohort$year[dfcohort$MainParentYearat45<1996] <- 1991 
dfcohort$year[dfcohort$MainParentYearat45>=1996 & dfcohort$MainParentYearat45<2001] <- 1996 
dfcohort$year[dfcohort$MainParentYearat45>=2001 & dfcohort$MainParentYearat45<2006] <- 2001
dfcohort$year[dfcohort$MainParentYearat45>=2006 & dfcohort$MainParentYearat45<2011] <- 2006
dfcohort$year[dfcohort$MainParentYearat45>2011] <- 2011


dfcohort$noc2011 <- dfcohort$NOC2_CD11_Main
dfcohort$sex <- dfcohort$gender_MainParent
dfcohort$pr <- dfcohort$DESTINATION_PROVINCE
####################################

# use the marital status upon landing:  single is not married all else is previously married or currently married
dfcohort$married <- ifelse(dfcohort$MARITAL_STATUS_ROLLUP_main %in% c(1,3,9),0,1)

##########################################

dfcohort$SchoolingBins <- dfcohort$SchoolingBins_Main

######################################

# clean management
dfcohort$noc2011[dfcohort$NOC2_CD11_Main=="-" & dfcohort$SKILL_LEVEL_CD11_Main=="0"] <- "06"  # these are managers give them "06" retail managers: lowest level of managers

# CALCULATE LOWEST LEVEL OCCUPATION 
lowocc=allcens %>% group_by(noc2011) %>% summarize(meaninc=mean(hhinc_real, na.rm=TRUE))

# check education level of new workers
educ_new_workers=dfcohort %>% filter(SKILL_LEVEL_CD11_Main %in% c("N","-","99")) %>% group_by(SchoolingBins) %>% summarize(count=n())

# for new workers and non workers and missing and not stated: give them lowest occupation level - these people have low education levels
dfcohort$occ_imputeflag=0
dfcohort$occ_imputeflag[dfcohort$SKILL_LEVEL_CD11_Main %in% c("N", "O", "S", "R", "-","99")]=1

dfcohort$noc2011[dfcohort$SKILL_LEVEL_CD11_Main %in% c("N", "O", "S", "R", "-","99")]="86"  # these are new workers and non workers , give them lowest value occupation, labourers

dfcohort$noc2011[dfcohort$noc2011 %in% c("99","-")]=86
dfcohort$occ_imputeflag[dfcohort$noc2011 %in% c("99","-")]=1

dfcohort$SchoolingBins[dfcohort$SchoolingBins==""]="<12"
dfcohort=dfcohort[dfcohort$sex!=3,]

dfcohort$noc2011=factor(dfcohort$noc2011)
dfcohort$noc2011=relevel(dfcohort$noc2011, ref="86")


#############################################
#####      RUN PREDICTION EXERCISE      #####
#############################################

allcens$loghhinc=log(allcens$hhinc_real + 1)

### Fit prediction model:
# Model 1: no CMA (as before in first submission)
# Model 2: with CMA
# Model 3: For new workers, don't use intended occupation
# Model 4: Prediction using those that arrive early (before age of 30)
model1 <- lm(loghhinc ~ factor(noc2011)*factor(sex)+ factor(SchoolingBins)*factor(sex) + factor(year) + factor(sex)+factor(married), weights=person_weight, data=allcens)
model2 <- lm(loghhinc ~ factor(noc2011)*factor(sex)+ factor(SchoolingBins)*factor(sex) + factor(year) + factor(sex)+factor(married) + factor(cma1), weights=person_weight, data=allcens)
model3 <- lm(loghhinc ~ factor(sex)+ factor(SchoolingBins)*factor(sex) + factor(year) + factor(sex)+factor(married), weights=person_weight, data=allcens)
model4 <- lm(MainParent_Income_HH_MainParentAge45_49_pctparent ~ factor(noc2011)*factor(sex)+ factor(SchoolingBins)*factor(sex) + factor(year) + factor(sex)+factor(married), data=dfcohort[LandingageMain<=30])

dfcohort$logMainParent45_49=log(dfcohort$MainParent_Income_HH_MainParentAge45_49+1)
model5 <- lm(logMainParent45_49 ~ factor(noc2011)*factor(sex)+ factor(SchoolingBins)*factor(sex) + factor(year) + factor(sex)+factor(married), data=dfcohort[LandingageMain<=30])

# Income percentiles from the LAD (at year-age-level)
cpi <- read.csv("H:/Zheng_10223/Joint/cpi/cpi.csv")
income_pct <- read.csv("H:/Zheng_10223/Joint/LAD/finaldist.csv", header=TRUE, stringsAsFactors = FALSE)
income_pct <- merge(x=income_pct, y=cpi, by.x="year", by.y="year", all.x=T)
income_pct$pctindTIRC_real <- income_pct$pctindTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pcthhTIRC_real <- income_pct$pcthhTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctindearn_real <- income_pct$pctindearn*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctparentTIRC_real <- income_pct$pctparentTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$logpctparentTIRC_real <- log(income_pct$pctparentTIRC_real +1 )

income_pct <- data.table(income_pct)
income_pct <- income_pct[order(income_pct$year, income_pct$age, income_pct$pct),]

##### Predict income

### Calculate income rank of predicted log income against all Canadians
dfcohort$PredictedLogIncome_pct_model1 <- NA
dfcohort$PredictedLogIncome_pct_model2 <- NA
dfcohort$PredictedLogIncome_pct_model3 <- NA
dfcohort$PredictedIncome_pct_model4 <- NA
dfcohort$PredictedLogIncome_pct_model5 <- NA

### Model 1: no CMA (as before in first submission)
dfcohort_impute <- dfcohort[noc2011 %in% allcens$noc2011 & sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins]
dfcohort_impute$PredictedLogIncome_model1 <- predict(model1, newdata=dfcohort_impute)
dfcohort_impute$exp_PredictedLogIncome_model1=exp(dfcohort_impute$PredictedLogIncome_model1)
dfcohort <- merge(dfcohort, dfcohort_impute[,c("IMDB_ID","PredictedLogIncome_model1","exp_PredictedLogIncome_model1")],by=c("IMDB_ID"), all.x=TRUE)

# Get pct ranking
for(j in 1982:2019){
  dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45==j)] <- findInterval(dfcohort$exp_PredictedLogIncome_model1[which(dfcohort$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45==j  & dfcohort$PredictedLogIncome_pct_model1==min(dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45==j & dfcohort$PredictedLogIncome_pct_model1==min(dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45==j )], na.rm=T))])
}

# For parents who turn 45 before 1982, use 1982:
dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45<1982)] <- findInterval(dfcohort$exp_PredictedLogIncome_model1[which(dfcohort$MainParentYearat45<1982)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==1982 & income_pct$age==45)])
dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45<1982 & dfcohort$PredictedLogIncome_pct_model1==min(dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))] <- 0.5*(1+min(dfcohort$PredictedLogIncome_pct_model1[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))


### Model 2: with CMA
dfcohort_impute <- dfcohort[noc2011 %in% allcens$noc2011 & sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins & DESTINATION_CMA %in% allcens$cma1]
setnames(dfcohort_impute, old="DESTINATION_CMA", new="cma1")
dfcohort_impute$PredictedLogIncome_model2 <- predict(model2, newdata=dfcohort_impute)
dfcohort_impute$exp_PredictedLogIncome_model2=exp(dfcohort_impute$PredictedLogIncome_model2)
dfcohort <- merge(dfcohort,dfcohort_impute[,c("IMDB_ID","PredictedLogIncome_model2","exp_PredictedLogIncome_model2")],by=c("IMDB_ID"), all.x=TRUE)

# Get pct ranking
for(j in 1982:2019){
  dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45==j)] <- findInterval(dfcohort$exp_PredictedLogIncome_model2[which(dfcohort$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45==j  & dfcohort$PredictedLogIncome_pct_model2==min(dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45==j & dfcohort$PredictedLogIncome_pct_model2==min(dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45==j )], na.rm=T))])
}

# For parents who turn 45 before 1982, use 1982:
dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45<1982)] <- findInterval(dfcohort$exp_PredictedLogIncome_model2[which(dfcohort$MainParentYearat45<1982)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==1982 & income_pct$age==45)])
dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45<1982 & dfcohort$PredictedLogIncome_pct_model2==min(dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))] <- 0.5*(1+min(dfcohort$PredictedLogIncome_pct_model2[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))


### Model 3: For new workers, don't use intended occupation
notnewworker <- dfcohort[occ_imputeflag==0]
newworker <- dfcohort[occ_imputeflag==1]

# use model 1 for non-imputed
notnewworker <- notnewworker[noc2011 %in% allcens$noc2011 & sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins]
notnewworker$PredictedLogIncome_model3 <- predict(model1, newdata=notnewworker)
notnewworker$exp_PredictedLogIncome_model3=exp(notnewworker$PredictedLogIncome_model3)

# use model 3 for new workers (imputed)
newworker <- newworker[sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins]
newworker$PredictedLogIncome_model3 <- predict(model3, newdata=newworker)
newworker$exp_PredictedLogIncome_model3=exp(newworker$PredictedLogIncome_model3)

dfcohort_impute <- rbind(notnewworker[,c("IMDB_ID", "PredictedLogIncome_model3", "exp_PredictedLogIncome_model3")],
                         newworker[,c("IMDB_ID", "PredictedLogIncome_model3", "exp_PredictedLogIncome_model3")])
dfcohort <- merge(dfcohort, dfcohort_impute, by.x="IMDB_ID", by.y="IMDB_ID", all.x=TRUE)

# Get pct ranking
for(j in 1982:2019){
  dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45==j)] <- findInterval(dfcohort$exp_PredictedLogIncome_model3[which(dfcohort$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45==j  & dfcohort$PredictedLogIncome_pct_model3==min(dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45==j & dfcohort$PredictedLogIncome_pct_model3==min(dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45==j )], na.rm=T))])
}

# For parents who turn 45 before 1982, use 1982:
dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45<1982)] <- findInterval(dfcohort$exp_PredictedLogIncome_model3[which(dfcohort$MainParentYearat45<1982)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==1982 & income_pct$age==45)])
dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45<1982 & dfcohort$PredictedLogIncome_pct_model3==min(dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))] <- 0.5*(1+min(dfcohort$PredictedLogIncome_pct_model3[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))


### Model 4: Prediction using those that arrive early
dfcohort_impute <- dfcohort[noc2011 %in% allcens$noc2011 & sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins]
dfcohort_impute$PredictedIncome_model4 <- predict(model4, newdata=dfcohort_impute)
dfcohort <- merge(dfcohort,dfcohort_impute[,c("IMDB_ID","PredictedIncome_model4")],by=c("IMDB_ID"), all.x=TRUE)

# Get pct ranking
for(j in 1982:2019){
  dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45==j)] <- findInterval(dfcohort$PredictedIncome_model4[which(dfcohort$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45==j & dfcohort$PredictedIncome_pct_model4==min(dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45==j)], na.rm=T))] <-0.5*(1+min(dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45==j )], na.rm=T))
}

# For parents who turn 45 before 1982, use 1982:
dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45<1982)] <- findInterval(dfcohort$PredictedIncome_model4[which(dfcohort$MainParentYearat45<1982)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==1982 & income_pct$age==45)])
dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45<1982 & dfcohort$PredictedIncome_pct_model4==min(dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))] <- 0.5*(1+min(dfcohort$PredictedIncome_pct_model4[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))

# Use actual income for those before 30
dfcohort$PredictedIncome_pct_model4[which(dfcohort$LandingageMain<=30)] <- dfcohort$MainParent_Income_HH_MainParentAge45_49_pctparent[which(dfcohort$LandingageMain<=30)]

### Model 5: Prediction using those that arrive early ; but using logs
dfcohort_impute <- dfcohort[noc2011 %in% allcens$noc2011 & sex %in% allcens$sex & SchoolingBins %in% allcens$SchoolingBins]

dfcohort_impute$PredictedLogIncome_model5 <- predict(model5, newdata=dfcohort_impute)
dfcohort_impute$exp_PredictedLogIncome_model5 =exp(dfcohort_impute$PredictedLogIncome_model5)

dfcohort <- merge(dfcohort,dfcohort_impute[,c("IMDB_ID","exp_PredictedLogIncome_model5")],by=c("IMDB_ID"), all.x=TRUE)

# Get pct ranking
for(j in 1982:2019){
  dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45==j)] <- findInterval(dfcohort$exp_PredictedLogIncome_model5[which(dfcohort$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45==j  & dfcohort$PredictedLogIncome_pct_model5==min(dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45==j & dfcohort$PredictedLogIncome_pct_model5==min(dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45==j )], na.rm=T))])
}

# For parents who turn 45 before 1982, use 1982:
dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45<1982)] <- findInterval(dfcohort$exp_PredictedLogIncome_model5[which(dfcohort$MainParentYearat45<1982)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==1982 & income_pct$age==45)])
dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45<1982 & dfcohort$PredictedLogIncome_pct_model5==min(dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))] <- 0.5*(1+min(dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$MainParentYearat45<=1982)], na.rm=T))


# Use actual income for those before 30
dfcohort$PredictedLogIncome_pct_model5[which(dfcohort$LandingageMain<=30)] <- dfcohort$MainParent_Income_HH_MainParentAge45_49_pctparent[which(dfcohort$LandingageMain<=30)]

##################################################
####   Get new Rank-Rank (Predicted Income)   ####
##################################################

realized_refugee <- lm(Child_Income_IND_30_34_pct~MainParent_Income_HH_MainParentAge45_49_pctparent, data=dfcohort[Status_Refugee==1])
RR_model1_refugee <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model1, data=dfcohort[Status_Refugee==1])
RR_model2_refugee <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model2, data=dfcohort[Status_Refugee==1])
RR_model3_refugee <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model3, data=dfcohort[Status_Refugee==1])
RR_model4_refugee <- lm(Child_Income_IND_30_34_pct~PredictedIncome_pct_model4, data=dfcohort[Status_Refugee==1])
RR_model5_refugee <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model5, data=dfcohort[Status_Refugee==1])

realized_non <- lm(Child_Income_IND_30_34_pct~MainParent_Income_HH_MainParentAge45_49_pctparent, data=dfcohort[Status_Refugee==0])
RR_model1_non <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model1, data=dfcohort[Status_Refugee==0])
RR_model2_non <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model2, data=dfcohort[Status_Refugee==0])
RR_model3_non <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model3, data=dfcohort[Status_Refugee==0])
RR_model4_non <- lm(Child_Income_IND_30_34_pct~PredictedIncome_pct_model4, data=dfcohort[Status_Refugee==0])

RR_model5_non <- lm(Child_Income_IND_30_34_pct~PredictedLogIncome_pct_model5, data=dfcohort[Status_Refugee==0])

#########################################################
#####     Murphy-Topel Corrected Standard Errors    #####
#########################################################

# Simplifies greatly because we only have one endogenous variable (parental income)

# First stage is the Census prediction regression
# Second stage is rank rank on fitted parental income

Get_ValidVCov <- function(first_stage, second_stage){
  n1 <- nobs(second_stage)
  n2 <- nobs(first_stage)
  beta_TW2sls <- matrix(coef(second_stage),nrow=1, ncol=2)
  Sigma_eta <- mean(residuals(first_stage)^2)
  sigma_oneone <- mean(residuals(second_stage)^2)
  
  
  MT_adj <- 1+(1+(n1/n2)*beta_TW2sls%*%t(beta_TW2sls)*Sigma_eta/sigma_oneone)
  Valid_VCOV <- as.numeric(MT_adj)*vcov(second_stage)
  return(Valid_VCOV)
}

##### OUTPUT

# Coefficients
View(t(as.matrix(c(coef(realized_refugee)[1],coef(RR_model1_refugee)[1],coef(RR_model2_refugee)[1],coef(RR_model3_refugee)[1],coef(RR_model4_refugee)[1],coef(RR_model5_refugee)[1],
                   coef(realized_non)[1],coef(RR_model1_non)[1],coef(RR_model2_non)[1],coef(RR_model3_non)[1],coef(RR_model4_non)[1],coef(RR_model5_non)[1]))))


t1=(as.data.frame(c(coef(realized_refugee)[1],coef(RR_model1_refugee)[1],coef(RR_model2_refugee)[1],coef(RR_model3_refugee)[1],coef(RR_model4_refugee)[1],coef(RR_model5_refugee)[1],
                 coef(realized_non)[1],coef(RR_model1_non)[1],coef(RR_model2_non)[1],coef(RR_model3_non)[1],coef(RR_model4_non)[1],coef(RR_model5_non)[1])))

write.csv(t1,"H:/Zheng_10223/ToVet/Output/IncomePredictioncoef1.csv")


View(t(as.matrix(c(coef(realized_refugee)[2],coef(RR_model1_refugee)[2],coef(RR_model2_refugee)[2],coef(RR_model3_refugee)[2],coef(RR_model4_refugee)[2],coef(RR_model5_refugee)[2],
                   coef(realized_non)[2],coef(RR_model1_non)[2],coef(RR_model2_non)[2],coef(RR_model3_non)[2],coef(RR_model4_non)[2],coef(RR_model5_non)[2]))))


t2=as.data.frame(c(coef(realized_refugee)[2],coef(RR_model1_refugee)[2],coef(RR_model2_refugee)[2],coef(RR_model3_refugee)[2],coef(RR_model4_refugee)[2],coef(RR_model5_refugee)[2],
               coef(realized_non)[2],coef(RR_model1_non)[2],coef(RR_model2_non)[2],coef(RR_model3_non)[2],coef(RR_model4_non)[2],coef(RR_model5_non)[2]))

write.csv(t2,"H:/Zheng_10223/ToVet/Output/IncomePredictioncoef2.csv")



# Valid (Murphy-Topel adjusted) standard errors

View(t(as.matrix(c(sqrt(vcov(realized_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_refugee)[1,1]), 
                  
                  sqrt(vcov(realized_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_non)[1,1]),
                  sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_non)[1,1])
                  
                  ))))

se1=(as.data.frame(c(sqrt(vcov(realized_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_refugee)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_refugee)[1,1]), 
                  
                  sqrt(vcov(realized_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_non)[1,1]), 
                  sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_non)[1,1]),
                  sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_non)[1,1])
                  
)))



View(t(as.matrix(c(sqrt(vcov(realized_refugee)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_refugee)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_refugee)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_refugee)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_refugee)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_refugee)[2,2]),
                   sqrt(vcov(realized_non)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_non)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_non)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_non)[2,2]), 
                   sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_non)[2,2]),
                   sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_non)[2,2])))))

se2=(as.data.frame(c(sqrt(vcov(realized_refugee)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_refugee)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_refugee)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_refugee)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_refugee)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_refugee)[2,2]),
              sqrt(vcov(realized_non)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model1, second_stage=RR_model1_non)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model2, second_stage=RR_model2_non)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model3, second_stage=RR_model3_non)[2,2]), 
              sqrt(Get_ValidVCov(first_stage=model4, second_stage=RR_model4_non)[2,2]),
              sqrt(Get_ValidVCov(first_stage=model5, second_stage=RR_model5_non)[2,2]))))


write.csv(se1,"H:/Zheng_10223/ToVet/Output/se1_incomepredict.csv")
write.csv(se2,"H:/Zheng_10223/ToVet/Output/se2_incomepredict.csv")

# N
Nout=(as.data.frame(c(nobs(realized_refugee),nobs(RR_model1_refugee),nobs(RR_model2_refugee),nobs(RR_model3_refugee),nobs(RR_model4_refugee),nobs(RR_model5_refugee),
                   nobs(realized_non),nobs(RR_model1_non),nobs(RR_model2_non),nobs(RR_model3_non),nobs(RR_model4_non),nobs(RR_model5_non))))


write.csv(Nout,"H:/Zheng_10223/ToVet/Output/Nout_incomepredict.csv")

vcov=Get_ValidVCov(first_stage=model1, second_stage=RR_model1_refugee)


write.csv(vcov,"H:/Zheng_10223/ToVet/Output/incomepredict_vcov.csv")

#############################################################################
# See who underplaces: using Model 1 
dfcohort$predict_realized_diffpct=dfcohort$PredictedLogIncome_pct_model1-dfcohort$MainParent_Income_HH_MainParentAge45_49_pctparent

# Heterogeneity by who underplaces 
dfcohort$WORLD_AREA_BIRTH=relevel(factor(dfcohort$WORLD_AREA_BIRTH), ref="Africa and Middle East")

# World area of Birth
model1=lm(predict_realized_diffpct ~ factor(WORLD_AREA_BIRTH), data=dfcohort[dfcohort$ImmigrationCategory=="Refugee",])

# Language
dfcohort$language=0
dfcohort$language[(dfcohort$AnyEnglish_Main==1|dfcohort$AnyFrench_Main==1)]=1

model2=lm(predict_realized_diffpct ~ factor(language),data=dfcohort[dfcohort$ImmigrationCategory=="Refugee",])

# Parental landing age
dfcohort$LandingAgeMain_cat=cut(dfcohort$LandingageMain, breaks=c(18,30,45,68), labels=c("18-30","30-45","45-68"))

model3=lm(predict_realized_diffpct ~ factor(LandingAgeMain_cat), data=dfcohort[dfcohort$ImmigrationCategory=="Refugee",])

# All covariates 

model4=lm(predict_realized_diffpct ~ factor(LandingAgeMain_cat) + factor(language) + factor(WORLD_AREA_BIRTH), data=dfcohort[dfcohort$ImmigrationCategory=="Refugee",])




# World area of Birth
model5=lm(predict_realized_diffpct ~ factor(WORLD_AREA_BIRTH), data=dfcohort[dfcohort$ImmigrationCategory!="Refugee",])

# Language
dfcohort$language=0
dfcohort$language[(dfcohort$AnyEnglish_Main==1|dfcohort$AnyFrench_Main==1)]=1

model6=lm(predict_realized_diffpct ~ factor(language),data=dfcohort[dfcohort$ImmigrationCategory!="Refugee",])

# Parental landing age
dfcohort$LandingAgeMain_cat=cut(dfcohort$LandingageMain, breaks=c(18,30,45,68), labels=c("18-30","30-45","45-68"))

model7=lm(predict_realized_diffpct ~ factor(LandingAgeMain_cat), data=dfcohort[dfcohort$ImmigrationCategory!="Refugee",])

# All covariates 

model8=lm(predict_realized_diffpct ~ factor(LandingAgeMain_cat) + factor(language) + factor(WORLD_AREA_BIRTH), data=dfcohort[dfcohort$ImmigrationCategory!="Refugee",])


stargazer(model1,model2,model3, model4,add.lines =list( c("Sample","Refugees", "Refugees", "Refugees", "Refugees")), out="H:/Zheng_10223/ToVet/Table_underplacemodel1_het_unrounded_part1.txt")
stargazer( model5,model6,model7,model8,add.lines =list( c("Sample","Non-Refugees", "Non-Refugees","Non-Refugees", "Non-Refugees")), out="H:/Zheng_10223/ToVet/Table_underplacemodel1_het_unrounded_part2.txt")


stargazer(model1,model2,model3, model4,add.lines =list( c("Sample","Refugees", "Refugees", "Refugees", "Refugees")), out="H:/Zheng_10223/ToVet/Table_underplacemodel1_het_unrounded_part1.tex")
stargazer( model5,model6,model7,model8,add.lines =list( c("Sample","Non-Refugees", "Non-Refugees","Non-Refugees", "Non-Refugees")), out="H:/Zheng_10223/ToVet/Table_underplacemodel1_het_unrounded_part2.tex")

source("IncomePrediction_Plots.R")

